Subgroup Identification Analysis

GBSG Example

Author

Larry Leon

Code
rm(list=ls())
# Record timing
# Start timer
t_start <- proc.time()

# Set options
knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.align = 'center',
  fig.retina = 2
)
library(tinytex)
library(ggplot2)
library(gt)

library(survival)
library(data.table)
library(randomForest)
library(grf)
library(policytree)
library(DiagrammeR)

# library(devtools)
# install_github("larry-leon/forestsearch", force = TRUE)

library(forestsearch)
library(weightedsurv)

# Set theme for plots
theme_set(theme_minimal(base_size = 12))

1 Summary

Reproducing main GBSG analysis

1.1 Datasetup

Code
df.analysis <- gbsg
df.analysis <- within(df.analysis,{
id <- as.numeric(c(1:nrow(df.analysis)))  
# time to months
time_months <- rfstime/30.4375
grade3 <- ifelse(grade=="3",1,0)
treat <- hormon
})
confounders.name <- c("age","meno","size","grade3","nodes","pgr","er")
outcome.name <- c("time_months")
event.name <- c("status")
id.name <- c("id")
treat.name <- c("hormon")

1.2 Kaplan-Meier curves and baseline summary

Code
dfcount <- df_counting(
  df = df.analysis,
  by.risk = 6,
  tte.name = outcome.name, 
  event.name = event.name, 
  treat.name = treat.name
)
plot_weighted_km(dfcount, conf.int = TRUE, show.logrank = TRUE, ymax = 1.05, xmed.fraction = 0.775, ymed.offset = 0.125)

Code
create_summary_table(data = df.analysis, treat_var = treat.name, 
                     table_title = "GBSG Characteristics by Treatment Arm",
                                      vars_continuous=c("age","nodes","size","er","pgr"),
                                      vars_categorical=c("grade","grade3"),
                                      font_size = 12)
GBSG Characteristics by Treatment Arm
Characteristic Control (n=440) Treatment (n=246) P-value1 SMD2
age Mean (SD) 51.1 (10.0) 56.6 (9.4) <0.001 0.57
nodes Mean (SD) 4.9 (5.6) 5.1 (5.3) 0.665 0.03
size Mean (SD) 29.6 (14.4) 28.8 (14.1) 0.470 0.06
er Mean (SD) 79.7 (124.2) 125.8 (191.1) <0.001 0.30
pgr Mean (SD) 102.0 (170.0) 124.3 (249.7) 0.213 0.11
grade 0.273 0.06
1 48 (10.9%) 33 (13.4%)
2 281 (63.9%) 163 (66.3%)
3 111 (25.2%) 50 (20.3%)
grade3 0.174 0.05
0 329 (74.8%) 196 (79.7%)
1 111 (25.2%) 50 (20.3%)
1 P-values: t-test for continuous, chi-square/Fisher's exact for categorical/binary variables
2 SMD = Standardized mean difference (Cohen's d for continuous, Cramer's V for categorical)

1.3 GRF analysis

Code
## GRF
grf_est1 <- grf.subg.harm.survival(data=df.analysis,
confounders.name = confounders.name,
outcome.name=outcome.name, event.name=event.name, id.name=id.name, treat.name=treat.name,
maxdepth = 2, n.min = 60, dmin.grf = 12, frac.tau=0.6, details=TRUE)
tau, maxdepth = 46.75811 2 
   leaf.node control.mean control.size control.se depth
1          2         6.49        82.00       3.34     1
2          3        -4.10       604.00       1.06     1
11         4        -7.90       112.00       2.81     2
21         5         3.86       177.00       1.87     2
4          7        -5.89       356.00       1.33     2

Selected subgroup:
  leaf.node control.mean control.size control.se depth
1         2         6.49        82.00       3.34     1

GRF subgroup found
Terminating node at max.diff (sg.harm.id):
[1] "er <= 0"

All splits:
[1] "er <= 0"   "age <= 50" "age <= 43"
Code
# NOTE: In general for GRF trees
# leaf1 --> recommend control
# leaf2 --> recommend treatment
# Tree depth 1
plot(grf_est1$tree1,leaf.labels=c("Control","Treat"))
Code
# Tree depth 2
plot(grf_est1$tree2,leaf.labels=c("Control","Treat"))

1.4 Forestsearch with depth=2 (maxk = 2)

Code
# Setup parallel processing
library(doFuture)
Warning: package 'doFuture' was built under R version 4.5.2
Warning: package 'future' was built under R version 4.5.2
Code
library(doRNG)

registerDoFuture()
registerDoRNG()

system.time({fs <- forestsearch(df.analysis,  confounders.name = confounders.name,
                                outcome.name = "time_months", treat.name = "hormon", event.name = "status", id.name = "id",
                                potentialOutcome.name = NULL, 
                                df.test = NULL,
                                flag_harm.name = NULL,
                                hr.threshold = 1.0, hr.consistency = 0.9, 
                                pconsistency.threshold = 0.90, stop_threshold = 0.95,
                                sg_focus = "hr", max_subgroups_search = 10, use_twostage = TRUE,
                                parallel_args = list(plan="multisession", workers = 13, show_message = TRUE),
                                showten_subgroups = TRUE, details=TRUE,
                                conf_force = NULL,
                                cut_type = "default", use_grf = TRUE, plot.grf = FALSE, use_lasso = FALSE,
                                maxk = 2, fs.splits = 100,
                                n.min = 60, d0.min = 12, d1.min = 12,
                                plot.sg = TRUE, by.risk = 6)
})

=== Two-Stage Consistency Evaluation Enabled ===
Stage 1 screening splits: 30 
Maximum total splits: 100 
Batch size: 20 
================================================

GRF stage for cut selection with dmin, tau = 12 0.6 
tau, maxdepth = 46.75811 2 
   leaf.node control.mean control.size control.se depth
1          2         6.49        82.00       3.34     1
2          3        -4.10       604.00       1.06     1
11         4        -7.90       112.00       2.81     2
21         5         3.86       177.00       1.87     2
4          7        -5.89       356.00       1.33     2

Selected subgroup:
  leaf.node control.mean control.size control.se depth
1         2         6.49        82.00       3.34     1

GRF subgroup found
Terminating node at max.diff (sg.harm.id):
[1] "er <= 0"

All splits:
[1] "er <= 0"   "age <= 50" "age <= 43"
GRF cuts identified: 3 
  Cuts: er <= 0, age <= 50, age <= 43 
# of continuous/categorical characteristics 5 2 
Continuous characteristics: age size nodes pgr er 
Categorical characteristics: meno grade3 
Default cuts included (1st 20) age <= mean(age) age <= median(age) age <= qlow(age) age <= qhigh(age) size <= mean(size) size <= median(size) size <= qlow(size) size <= qhigh(size) nodes <= mean(nodes) nodes <= median(nodes) nodes <= qlow(nodes) nodes <= qhigh(nodes) pgr <= mean(pgr) pgr <= median(pgr) pgr <= qlow(pgr) pgr <= qhigh(pgr) er <= mean(er) er <= median(er) er <= qlow(er) er <= qhigh(er) 
Categorical: meno grade3 
Factors per GRF: er <= 0 age <= 50 age <= 43 
Initial GRF cuts included er <= 0 age <= 50 age <= 43 

===== CONSOLIDATED CUT EVALUATION (IMPROVED) =====
Evaluating 25 cut expressions once and caching...
Cut evaluation summary:
  Total cuts:  25 
  Valid cuts:  25 
  Errors:  0 
✓ All 25 factors validated as 0/1
===== END CONSOLIDATED CUT EVALUATION =====

# of candidate subgroup factors= 25 
 [1] "er <= 0"      "age <= 50"    "age <= 43"    "age <= 53.1"  "age <= 53"   
 [6] "age <= 46"    "age <= 61"    "size <= 29.3" "size <= 25"   "size <= 20"  
[11] "size <= 35"   "nodes <= 5"   "nodes <= 3"   "nodes <= 1"   "nodes <= 7"  
[16] "pgr <= 110"   "pgr <= 32.5"  "pgr <= 7"     "pgr <= 131.8" "er <= 96.3"  
[21] "er <= 36"     "er <= 8"      "er <= 114"    "meno"         "grade3"      
Number of possible configurations (<= maxk): maxk = 2 , # combinations = 1275 
Events criteria: control >= 12 , treatment >= 12 
Subgroup search completed in 0.02 minutes
Found 74 subgroup candidate(s)
# of candidate subgroups (meeting all criteria) = 74 
Random seed set to: 8316951 
Two-stage parameters:
  n.splits.screen: 30 
  screen.threshold: 0.763 
  batch.size: 20 
  conf.level: 0.95 
Removed 6 near-duplicate subgroups
# of unique initial candidates: 68 
# Restricting to top stop_Kgroups = 10 
# of candidates to evaluate: 10 
# Early stop threshold: 0.95 

================================================================================ 
TOP 10 CANDIDATE SUBGROUPS FOR CONSISTENCY EVALUATION
Sorted by: hr 
================================================================================ 

Rank   HR        N       Events  K    Subgroup Definition
-------------------------------------------------------------------------------- 
1      2.537     61      34      2    er <= 0 & size <= 35
2      2.222     75      41      2    er <= 0 & pgr <= 32.5
3      2.164     68      38      2    er <= 0 & NOT(age <= 43)
4      2.054     61      35      2    er <= 0 & NOT(size <= 20)
5      1.992     64      34      2    er <= 0 & pgr <= 7
6      1.951     82      45      1    er <= 0
7      1.725     84      46      2    er <= 8 & NOT(meno)
8      1.722     76      47      2    NOT(size <= 29.3) & er <= 8
9      1.710     72      39      2    grade3 & pgr <= 7
10     1.707     71      36      2    age <= 50 & pgr <= 7
-------------------------------------------------------------------------------- 
Parallel config: workers = 13 , batch_size = 1 
Batch 1 / 10 : candidates 1 - 1 

==================================================
EARLY STOP TRIGGERED (batch 1 )
  Candidate: 1 of 10 
  Pcons: 0.99 >= 0.95 
==================================================

Evaluated 1 of 10 candidates (early stop) 
1 subgroups passed consistency threshold

*** Subgroup found: {er <= 0} {size <= 35} 
% consistency criteria met= 0.99 
SG focus = hr 
Seconds and minutes forestsearch overall = 3.16 0.0527 
Consistency algorithm used: twostage 
   user  system elapsed 
  4.298   0.112   3.166 
Code
plan("sequential")

# Results for estimation (training) data, which_df = "est" is default
res_tabs <- sg_tables(fs, ndecimals = 3, which_df = "est")
res_tabs$sg10_out
Identified Subgroups
Two-factor subgroups (maxk=2)
Factor 1 Factor 2 N Events E1 HR Pcons
{er <= 0} {size <= 35} 61 34 15 2.537 0.990
Search Configuration: Single-factor candidates (L) = 50; Maximum combinations evaluated = 1,275; Search depth (maxk) = 2
Search Results: Candidate subgroups found = 74; Maximum HR estimate = 2.54
Note: E1 = events in treatment arm; Pcons = consistency proportion
Code
res_tabs$tab_estimates
Treatment Effect Estimates
Training data estimates
Subgroup n n1 events m1 m0 RMST HR (95% CI)
ITT 686 (100.0%) 246 (35.9%) 299 (43.6%) 66.3 50.2 7.8 0.69 (0.54, 0.89)
Questionable1 61 (8.9%) 23 (37.7%) 34 (55.7%) 18.5 48 -19 2.54 (1.25, 5.17)
Recommend 625 (91.1%) 223 (35.7%) 265 (42.4%) 66.7 52.2 9.6 0.61 (0.47, 0.79)
1 Identified subgroup : {er <= 0} & {size <= 35}

1.5 Bootstrap Inference

Code
# patchhwork needed for a combined bootstrap plot (otherwise if not available or very few bootstraps do not produce)

library(patchwork)

# Number of bootstrap samples
NB <- 3
system.time({fs_bc <- forestsearch_bootstrap_dofuture(
  fs.est = fs, 
  nb_boots = NB, 
  show_three = FALSE, 
  details = TRUE)
})
Ystar matrix generated should be 'boots x N': 3 x 686

ForestSearch parameters for bootstrap iterations:
  - sg_focus: hr 
  - maxk: 2 
  - fs.splits: 100 
  - max_subgroups_search: 10 
  - hr.threshold: 1 
  - hr.consistency: 0.9 
  - pconsistency.threshold: 0.9 
  - n.min: 60 
  - use_twostage: TRUE 
  - use_lasso: FALSE 
  - use_grf: TRUE 
  Bootstrap-specific overrides:
  - grf_res: NULL (forces re-selection)
  - grf_cuts: NULL (forces re-selection)
  - parallel_args: sequential (prevents nested parallelism)
  - details: FALSE (suppressed in workers)
  - plot.sg: FALSE
  - plot.grf: FALSE

=== Bootstrap Analysis Complete ===
Success rate: 100.0% (3/3)

H (Questionable) Estimates:
  Unadjusted:       2.54 (1.25,5.17) 
  Bias-corrected:  2.35 (0.12,46.19) 

Hc (Recommend) Estimates:
  Unadjusted:       0.61 (0.47,0.79) 
  Bias-corrected:  0.55 (0.24,1.27) 
===================================
   user  system elapsed 
  1.510   0.077   5.317 
Code
plan("sequential")

1.5.1 Diagnostics and Summaries

Code
summaries <- summarize_bootstrap_results(
      sgharm = fs$sg.harm,
      boot_results = fs_bc,
      create_plots = TRUE,
      est.scale = "hr"
    )

===============================================================
           BOOTSTRAP ANALYSIS SUMMARY                          
===============================================================

IDENTIFIED SUBGROUP:
-------------------------------------------------------------
  H: {er <= 0} & {size <= 35}

BOOTSTRAP SUCCESS METRICS:
-------------------------------------------------------------
  Total iterations:              3
  Successful subgroup ID:        3 (100.0%)
  Failed to find subgroup:       0 (0.0%)

TIMING ANALYSIS:
-------------------------------------------------------------
Overall:
  Total bootstrap time:          0.07 minutes (0.00 hours)
  Average per iteration:         0.02 min (1.4 sec)
  Projected for 1000 boots:      23.19 min (0.39 hrs)
Code
sg_tab <- summaries$table

sg_tab
Treatment Effect by Subgroup
Bootstrap bias-corrected estimates (3 iterations)
Subgroup
Sample Size
Survival
Treatment Effect
N NT Events MedT MedC RMSTd HR
(95% CI)1
HR
(95% CI)2
Qstnbl3 61 (8.9%) 23 (37.7%) 34 (55.7%) 18.5 48 -19 2.54 (1.25, 5.17) 2.35 (0.12,46.19)
Recmnd 625 (91.1%) 223 (35.7%) 265 (42.4%) 66.7 52.2 9.6 0.61 (0.47, 0.79) 0.55 (0.24,1.27)
1 Unadjusted HR: Standard Cox regression hazard ratio with robust standard errors
2 Bias-corrected HR: Bootstrap-adjusted estimate using infinitesimal jackknife method (3 iterations). Corrects for optimism in subgroup selection.
3 Identified subgroup: {er <= 0} & {size <= 35}
Note: Med = Median survival time (months). RMSTd = Restricted mean survival time difference. Subgroup identified in 100.0% of bootstrap samples.
Code
event_summary <- summarize_bootstrap_events(fs_bc, threshold = 12)

=== Bootstrap Event Count Summary ===
Total bootstrap iterations: 3
Event threshold: <12 events

ORIGINAL Subgroup H on BOOTSTRAP samples:
  Control arm <12 events: 0 (0.0%)
  Treatment arm <12 events: 0 (0.0%)
  Either arm <12 events: 0 (0.0%)

ORIGINAL Subgroup Hc on BOOTSTRAP samples:
  Control arm <12 events: 0 (0.0%)
  Treatment arm <12 events: 0 (0.0%)
  Either arm <12 events: 0 (0.0%)

NEW Subgroups found: 3 (100.0%)

NEW Subgroup H* on ORIGINAL data:
  Control arm <12 events: 0 (0.0% of successful)
  Treatment arm <12 events: 0 (0.0% of successful)
  Either arm <12 events: 0 (0.0% of successful)

NEW Subgroup Hc* on ORIGINAL data:
  Control arm <12 events: 0 (0.0% of successful)
  Treatment arm <12 events: 0 (0.0% of successful)
  Either arm <12 events: 0 (0.0% of successful)
Code
summaries$diagnostics_table_gt
Bootstrap Diagnostics Summary
Analysis of 3 bootstrap iterations
Category Metric Value
Success Rate Total iterations 3
Successful 3 (100.0%)
Failed 0 (0.0%)
Success rating Excellent
Subgroup H (Questionable) Observed HR 2.537
Bias-corrected HR 2.346
Bootstrap CV (%) 61.7%
N estimates 3
Subgroup Hc (Recommend) Observed HR 0.608
Bias-corrected HR 0.553
Bootstrap CV (%) 31.1%
N estimates 3
Code
summaries$subgroup_summary$original_agreement
                            Metric                    Value
                            <char>                   <char>
1:      Total bootstrap iterations                        3
2:           Successful iterations                        3
3: Failed iterations (no subgroup)                        0
4:                                                         
5:    Original subgroup definition {er <= 0} & {size <= 35}
6:       Exact match with original                 0 (0.0%)
7:         Different from original               3 (100.0%)
8:   Partial match (shared factor)                1 (33.3%)
Code
summaries$subgroup_summary$factor_presence
  Rank Factor Count  Percent
1    1     er     2 66.66667
3    2    pgr     2 66.66667
2    3 grade3     1 33.33333
4    4   size     1 33.33333
Code
summaries$subgroup_summary$factor_presence_specific
  Rank Base_Factor Factor_Definition Count  Percent
2    1          er         {er <= 0}     1 33.33333
3    2          er         {er <= 8}     1 33.33333
4    3      grade3          {grade3}     1 33.33333
5    4         pgr       {pgr <= 33}     1 33.33333
6    5         pgr        {pgr <= 8}     1 33.33333
1    6        size   !{size <= 29.7}     1 33.33333
Code
summaries$plots$H_distribution
summaries$plots$Hc_distribution

1.6 Forest Search k-fold cross-validation

Code
# 1 iteration of 2-fold cross-validation (in practice use 10-fold and at least 50 (Ksims) iterations)
Ksims <- 1
system.time({fs_ten <- forestsearch_tenfold(fs.est = fs, sims = Ksims, Kfolds = 2, details = FALSE, 
                       parallel_args = list(plan = "multisession", workers = 13, show_message = TRUE))})

plan(sequential)
print(fs_ten$find_summary)
print(fs_ten$sens_summary)
print(head(fs_ten$sens_out))
print(head(fs_ten$find_out))

1.7 Forest Search OOB

Code
# OOB = out-of-bag prediction
# Kfolds = n (default to n-fold cross-validations)
# Kfold = 2 for illustration

fs_OOB <- forestsearch_Kfold(fs.est = fs, details = TRUE, Kfolds = 2,
                             parallel_args = list(plan = "callr", workers = 13, show_message = TRUE))
# Reset workers to single
plan(sequential)
summary_OOB <- forestsearch_KfoldOut(res=fs_OOB, details=TRUE, outall=TRUE)
table(summary_OOB$SGs_found[,1])
table(summary_OOB$SGs_found[,2])
Code
# Define subgroups to display (these could be pre-specified or of post-hoc interest)
subgroups <- list(
 age_gt65 = list(
 subset_expr = "age > 65",
 name = "age > 65",
     type = "reference"
   ),
 age_lt65 = list(
 subset_expr = "age <= 65",
 name = "age <= 65",
     type = "reference"
   ),
pgr_positive = list(
 subset_expr = "pgr > 0",
 name = "pgr > 0",
     type = "reference"
   ),
pgr_negative = list(
 subset_expr = "pgr <= 0",
 name = "pgr <= 0",
     type = "reference"
   )
  )

# If fs_OOB and fs_kfold are provided then CV metrics are displayed for both

# Create the forest plot
 result <- plot_subgroup_results_forestplot(
   fs_results = list(fs.est = fs, fs_bc = fs_bc, fs_OOB = NULL, fs_kfold = NULL),
   df_analysis = df.analysis,
   subgroup_list = subgroups,
   outcome.name = "time_months",
   event.name = "status",
   treat.name = "hormon",
   E.name = "Hormon",
   C.name = "CT",
   ci_column_spaces = 25
 )

# Display the plot
plot(result$plot)

Code
# Calculate elapsed time
t_elapsed <- (proc.time() - t_start)["elapsed"]
cat("Elapsed:", t_elapsed, "seconds\n")
Elapsed: 10.019 seconds